In [ ]:
import sys
sys.path.append('../ML/')
In [ ]:
import pandas as pd

years = [year for year in range(2018, 2023)] # start year to 2022
location = '臺中'
dfs = []
for year in years:
    df = pd.read_csv(f'./concat-data/{location}/{year}.csv')
    dfs.append(df)

df = pd.concat(dfs, ignore_index=True) # all data from 2018 to 2022
# df = clean(df) # clean df, e.g., nan handling, rename columns, ...
df
Out[ ]:
egg_TaijinPrice Duckegg_TNN_TaijinPrice 入中雞雛數 產蛋隻數 均 日 產蛋箱數 淘汰隻數 目 前 換羽隻數 Date WhiteChickQuantity RedChickQuantity ImportChickenQuantity 玉米粒 黃豆粉 玉米粉 脫殼豆粉 高蛋白豆粉 StationName AirTemperatureMean AirTemperatureMax Precipitation
0 24.5 34.0 1160830.0 31170982.0 114908.0 866824.0 633700.0 2018-01-01 245600 84800 0.0 6.10 12.15 6.65 13.3 13.21 臺中 17.3 27.2 0.0
1 24.5 34.0 1160830.0 31170982.0 114908.0 866824.0 633700.0 2018-01-02 245600 84800 0.0 6.10 12.15 6.65 13.3 13.21 臺中 17.3 27.2 0.0
2 26.5 34.0 1160830.0 31170982.0 114908.0 866824.0 633700.0 2018-01-03 245600 84800 0.0 6.10 12.15 6.65 13.3 13.21 臺中 17.3 27.2 0.0
3 26.5 34.0 1160830.0 31170982.0 114908.0 866824.0 633700.0 2018-01-04 245600 84800 0.0 6.10 12.15 6.63 13.3 13.21 臺中 17.3 27.2 4.0
4 26.5 34.0 1160830.0 31170982.0 114908.0 866824.0 633700.0 2018-01-05 245600 84800 0.0 6.10 12.15 6.63 13.3 13.20 臺中 17.3 27.2 0.5
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
1821 40.5 43.0 1592549.0 35037318.0 122636.0 1203140.0 990135.0 2022-12-27 387890 83670 NaN 10.85 19.90 11.60 21.1 21.00 臺中 17.8 28.1 0.0
1822 40.5 43.0 1592549.0 35037318.0 122636.0 1203140.0 990135.0 2022-12-28 387890 83670 NaN 11.05 19.90 11.68 21.1 21.00 臺中 17.8 28.1 0.0
1823 40.5 43.0 1592549.0 35037318.0 122636.0 1203140.0 990135.0 2022-12-29 353700 91500 NaN 11.05 19.90 11.78 21.1 21.00 臺中 17.8 28.1 T
1824 40.5 43.0 1592549.0 35037318.0 122636.0 1203140.0 990135.0 2022-12-30 353700 91500 NaN 11.20 19.90 11.78 21.1 21.00 臺中 17.8 28.1 0.0
1825 40.5 43.0 1592549.0 35037318.0 122636.0 1203140.0 990135.0 2022-12-31 353700 91500 NaN 11.20 19.90 11.78 21.1 21.00 臺中 17.8 28.1 0.0

1826 rows × 20 columns

In [ ]:
df['Date'] = pd.to_datetime(df['Date'])
df.set_index('Date', inplace=True)

df = df.drop(columns=['StationName'])
# rename columns
new_columns = {
    'egg_TaijinPrice':"EggPrice", 
    'Duckegg_TNN_TaijinPrice': "DuckEggPrice", 
    '入中雞雛數':"NumRuChung",
    '產蛋隻數':"NumOfChickLayingEggs",
    '均  日     產蛋箱數':"BoxesOfEggsPerDay", 
    '淘汰隻數':"NumDisuse", 
    '目  前         換羽隻數':"NumMoulting", 
    'WhiteChickQuantity':"WhiteChickQuantity",
    'RedChickQuantity':"RedChickQuantity", 
    'ImportChickenQuantity':"ImportChickenQuantity", 
    ' 玉米粒':"Corn", 
    ' 黃豆粉 ':"SoyBeanFlour", 
    '玉米粉 ':"CornFlour",
    '脫殼豆粉':"DehulledBeanFlour", 
    '高蛋白豆粉':"HighProteinBeanFlour", 
    'AirTemperatureMean':"AirTemperatureMean", 
    'AirTemperatureMax':"AirTemperatureMax",
    'Precipitation':"Precipitation"
}
df = df.rename(columns=new_columns)

# 將所有資料轉成數字,不是數字的資料會變成 NaN
df = df.apply(pd.to_numeric, errors='coerce')

# 空值全部填 0
df = df.fillna(float(0))
df
Out[ ]:
EggPrice DuckEggPrice NumRuChung NumOfChickLayingEggs BoxesOfEggsPerDay NumDisuse NumMoulting WhiteChickQuantity RedChickQuantity ImportChickenQuantity Corn SoyBeanFlour CornFlour DehulledBeanFlour HighProteinBeanFlour AirTemperatureMean AirTemperatureMax Precipitation
Date
2018-01-01 24.5 34.0 1160830.0 31170982.0 114908.0 866824.0 633700.0 245600 84800 0.0 6.10 12.15 6.65 13.3 13.21 17.3 27.2 0.0
2018-01-02 24.5 34.0 1160830.0 31170982.0 114908.0 866824.0 633700.0 245600 84800 0.0 6.10 12.15 6.65 13.3 13.21 17.3 27.2 0.0
2018-01-03 26.5 34.0 1160830.0 31170982.0 114908.0 866824.0 633700.0 245600 84800 0.0 6.10 12.15 6.65 13.3 13.21 17.3 27.2 0.0
2018-01-04 26.5 34.0 1160830.0 31170982.0 114908.0 866824.0 633700.0 245600 84800 0.0 6.10 12.15 6.63 13.3 13.21 17.3 27.2 4.0
2018-01-05 26.5 34.0 1160830.0 31170982.0 114908.0 866824.0 633700.0 245600 84800 0.0 6.10 12.15 6.63 13.3 13.20 17.3 27.2 0.5
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
2022-12-27 40.5 43.0 1592549.0 35037318.0 122636.0 1203140.0 990135.0 387890 83670 0.0 10.85 19.90 11.60 21.1 21.00 17.8 28.1 0.0
2022-12-28 40.5 43.0 1592549.0 35037318.0 122636.0 1203140.0 990135.0 387890 83670 0.0 11.05 19.90 11.68 21.1 21.00 17.8 28.1 0.0
2022-12-29 40.5 43.0 1592549.0 35037318.0 122636.0 1203140.0 990135.0 353700 91500 0.0 11.05 19.90 11.78 21.1 21.00 17.8 28.1 0.0
2022-12-30 40.5 43.0 1592549.0 35037318.0 122636.0 1203140.0 990135.0 353700 91500 0.0 11.20 19.90 11.78 21.1 21.00 17.8 28.1 0.0
2022-12-31 40.5 43.0 1592549.0 35037318.0 122636.0 1203140.0 990135.0 353700 91500 0.0 11.20 19.90 11.78 21.1 21.00 17.8 28.1 0.0

1826 rows × 18 columns

In [ ]:
df_shift = df.loc['2018-07-01':'2022-12-31']
df_shift
Out[ ]:
EggPrice DuckEggPrice NumRuChung NumOfChickLayingEggs BoxesOfEggsPerDay NumDisuse NumMoulting WhiteChickQuantity RedChickQuantity ImportChickenQuantity Corn SoyBeanFlour CornFlour DehulledBeanFlour HighProteinBeanFlour AirTemperatureMean AirTemperatureMax Precipitation
Date
2018-07-01 27.5 32.0 1548978.0 29964977.0 108553.0 861380.0 687628.0 372700 42600 21060.0 6.70 13.25 7.32 14.35 14.21 28.6 35.4 13.0
2018-07-02 27.5 32.0 1548978.0 29964977.0 108553.0 861380.0 687628.0 372700 42600 21060.0 6.70 13.25 7.32 14.35 14.21 28.6 35.4 67.5
2018-07-03 27.5 32.0 1548978.0 29964977.0 108553.0 861380.0 687628.0 372700 42600 21060.0 6.70 13.20 7.30 14.30 14.14 28.6 35.4 46.0
2018-07-04 27.5 34.0 1548978.0 29964977.0 108553.0 861380.0 687628.0 372700 42600 21060.0 6.70 13.15 7.28 14.30 14.11 28.6 35.4 31.0
2018-07-05 27.5 34.0 1548978.0 29964977.0 108553.0 861380.0 687628.0 372700 42600 21060.0 6.65 13.15 7.23 14.30 14.11 28.6 35.4 13.0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
2022-12-27 40.5 43.0 1592549.0 35037318.0 122636.0 1203140.0 990135.0 387890 83670 0.0 10.85 19.90 11.60 21.10 21.00 17.8 28.1 0.0
2022-12-28 40.5 43.0 1592549.0 35037318.0 122636.0 1203140.0 990135.0 387890 83670 0.0 11.05 19.90 11.68 21.10 21.00 17.8 28.1 0.0
2022-12-29 40.5 43.0 1592549.0 35037318.0 122636.0 1203140.0 990135.0 353700 91500 0.0 11.05 19.90 11.78 21.10 21.00 17.8 28.1 0.0
2022-12-30 40.5 43.0 1592549.0 35037318.0 122636.0 1203140.0 990135.0 353700 91500 0.0 11.20 19.90 11.78 21.10 21.00 17.8 28.1 0.0
2022-12-31 40.5 43.0 1592549.0 35037318.0 122636.0 1203140.0 990135.0 353700 91500 0.0 11.20 19.90 11.78 21.10 21.00 17.8 28.1 0.0

1645 rows × 18 columns

In [ ]:
from datetime import datetime, timedelta

days_befores = [30, 60, 90, 120, 150, 180]
features_to_extend = [
    'NumRuChung', 
    'NumOfChickLayingEggs', 
    'BoxesOfEggsPerDay', 
    'NumDisuse', 
    'NumMoulting', 
    'WhiteChickQuantity', 
    'RedChickQuantity', 
    'ImportChickenQuantity', 
    'Corn', 
    'SoyBeanFlour',
    'CornFlour', 
    'DehulledBeanFlour', 
    'HighProteinBeanFlour',
]
features_not_extended = [elem for elem in df_shift.columns if elem not in features_to_extend]

extend_columns = {feature: {day: [] for day in days_befores} for feature in features_to_extend}

def date_days_before(input_date, feature):
    # Convert input_date to a datetime object
    input_datetime = datetime.strptime(input_date, "%Y-%m-%d")

    for days_before in days_befores:
        result_datetime = input_datetime - timedelta(days=days_before) # Calculate some days before

        past_date = result_datetime.strftime("%Y-%m-%d") # Format and return the result as a string

        past_data = df.loc[past_date][feature] # find that data

        extend_columns[feature][days_before].append(past_data)


for idx in df_shift.index:
    date = str(idx)[:10] # idx = 2022-10-15 00:00:00, we only need the date
    for feat in extend_columns.keys():
        date_days_before(date, feat)
In [ ]:
df_extended = df_shift.copy(deep=True)
extended_columns = [] # for plotting 

for feat, dict in extend_columns.items():
    for days, list in dict.items():
        extend_col_name = f'{feat}_{int(days/30)}'
        df_extended[extend_col_name] = list
        extended_columns.append(extend_col_name)

df_shift = df_extended

# remove the original columns of the extended ones
df_shift = df_shift.drop(features_to_extend, axis=1)

df_shift
Out[ ]:
EggPrice DuckEggPrice AirTemperatureMean AirTemperatureMax Precipitation NumRuChung_1 NumRuChung_2 NumRuChung_3 NumRuChung_4 NumRuChung_5 ... DehulledBeanFlour_3 DehulledBeanFlour_4 DehulledBeanFlour_5 DehulledBeanFlour_6 HighProteinBeanFlour_1 HighProteinBeanFlour_2 HighProteinBeanFlour_3 HighProteinBeanFlour_4 HighProteinBeanFlour_5 HighProteinBeanFlour_6
Date
2018-07-01 27.5 32.0 28.6 35.4 13.0 1352635.0 1555965.0 1305120.0 1445970.0 1017671.0 ... 14.8 14.4 13.4 13.3 14.75 14.76 14.70 14.16 13.38 13.21
2018-07-02 27.5 32.0 28.6 35.4 67.5 1352635.0 1555965.0 1305120.0 1445970.0 1017671.0 ... 14.8 14.5 13.4 13.3 14.75 14.73 14.67 14.25 13.35 13.21
2018-07-03 27.5 32.0 28.6 35.4 46.0 1352635.0 1555965.0 1305120.0 1445970.0 1017671.0 ... 14.8 14.5 13.4 13.3 14.75 14.78 14.67 14.25 13.35 13.21
2018-07-04 27.5 34.0 28.6 35.4 31.0 1352635.0 1555965.0 1305120.0 1445970.0 1017671.0 ... 14.8 14.5 13.4 13.3 14.75 14.78 14.67 14.40 13.33 13.20
2018-07-05 27.5 34.0 28.6 35.4 13.0 1352635.0 1555965.0 1305120.0 1445970.0 1017671.0 ... 14.8 14.5 13.4 13.3 14.75 14.70 14.67 14.40 13.33 13.20
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
2022-12-27 40.5 43.0 17.8 28.1 0.0 1547765.0 1482554.0 1912212.0 1654430.0 1642580.0 ... 20.5 19.9 19.2 23.6 20.90 20.45 20.45 19.73 18.70 18.70
2022-12-28 40.5 43.0 17.8 28.1 0.0 1547765.0 1482554.0 1912212.0 1654430.0 1642580.0 ... 20.5 19.9 19.2 18.7 20.90 20.50 20.45 19.73 18.70 18.70
2022-12-29 40.5 43.0 17.8 28.1 0.0 1547765.0 1482554.0 1912212.0 1654430.0 1654430.0 ... 20.5 19.9 11.7 18.7 20.90 20.60 20.45 19.73 19.55 18.70
2022-12-30 40.5 43.0 17.8 28.1 0.0 1547765.0 1482554.0 1482554.0 1912212.0 1654430.0 ... 20.5 19.9 11.6 18.7 21.00 20.50 20.45 19.73 19.55 18.70
2022-12-31 40.5 43.0 17.8 28.1 0.0 1592549.0 1547765.0 1482554.0 1912212.0 1654430.0 ... 20.5 19.9 11.5 18.7 21.10 20.60 20.45 19.73 19.35 18.70

1645 rows × 83 columns

In [ ]:
"""
Highly relative feature list: (not extended)
DuckEggPrice            0.726912
NumRuChung              0.435494
Corn                    0.592012
SoyBeanFlour            0.635711
CornFlour               0.620423
DehulledBeanFlour       0.547041
HighProteinBeanFlour    0.639172
"""
Out[ ]:
'\nHighly relative feature list: (not extended)\nDuckEggPrice            0.726912\nNumRuChung              0.435494\nCorn                    0.592012\nSoyBeanFlour            0.635711\nCornFlour               0.620423\nDehulledBeanFlour       0.547041\nHighProteinBeanFlour    0.639172\n'
In [ ]:
from ML.preprocessing import feature_selection

df_shift_fs, selectedFeatures, targetCorr = feature_selection(df_shift, type='corr')
df_shift_fs
Number of selected features: 31 

Highly relative feature list:
DuckEggPrice              0.731987
Corn_1                    0.583691
Corn_2                    0.625401
Corn_3                    0.631921
Corn_4                    0.660566
Corn_5                    0.671084
Corn_6                    0.670482
SoyBeanFlour_1            0.591983
SoyBeanFlour_2            0.613008
SoyBeanFlour_3            0.590756
SoyBeanFlour_4            0.570220
SoyBeanFlour_5            0.551340
SoyBeanFlour_6            0.529676
CornFlour_1               0.606901
CornFlour_2               0.640753
CornFlour_3               0.657381
CornFlour_4               0.683974
CornFlour_5               0.684478
CornFlour_6               0.682916
DehulledBeanFlour_1       0.524395
DehulledBeanFlour_2       0.555826
DehulledBeanFlour_3       0.582212
DehulledBeanFlour_4       0.608746
DehulledBeanFlour_5       0.639264
DehulledBeanFlour_6       0.669838
HighProteinBeanFlour_1    0.591559
HighProteinBeanFlour_2    0.615984
HighProteinBeanFlour_3    0.591543
HighProteinBeanFlour_4    0.567360
HighProteinBeanFlour_5    0.544886
HighProteinBeanFlour_6    0.521926
Name: EggPrice, dtype: float64
Out[ ]:
DuckEggPrice Corn_1 Corn_2 Corn_3 Corn_4 Corn_5 Corn_6 SoyBeanFlour_1 SoyBeanFlour_2 SoyBeanFlour_3 ... DehulledBeanFlour_4 DehulledBeanFlour_5 DehulledBeanFlour_6 HighProteinBeanFlour_1 HighProteinBeanFlour_2 HighProteinBeanFlour_3 HighProteinBeanFlour_4 HighProteinBeanFlour_5 HighProteinBeanFlour_6 EggPrice
Date
2018-07-01 32.0 7.30 7.05 8.05 6.90 6.23 6.10 13.80 13.9 13.7 ... 14.4 13.4 13.3 14.75 14.76 14.70 14.16 13.38 13.21 27.5
2018-07-02 32.0 7.30 7.15 8.05 7.05 6.23 6.10 13.80 13.9 13.7 ... 14.5 13.4 13.3 14.75 14.73 14.67 14.25 13.35 13.21 27.5
2018-07-03 32.0 7.30 7.30 8.05 7.05 6.23 6.10 13.80 13.9 13.7 ... 14.5 13.4 13.3 14.75 14.78 14.67 14.25 13.35 13.21 27.5
2018-07-04 34.0 7.30 7.30 8.05 7.10 6.23 6.10 13.80 13.9 13.7 ... 14.5 13.4 13.3 14.75 14.78 14.67 14.40 13.33 13.20 27.5
2018-07-05 34.0 7.30 7.30 8.05 7.15 6.30 6.10 13.80 13.7 13.7 ... 14.5 13.4 13.3 14.75 14.70 14.67 14.40 13.33 13.20 27.5
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
2022-12-27 43.0 11.65 11.80 11.55 11.45 10.85 12.35 19.80 19.3 19.3 ... 19.9 19.2 23.6 20.90 20.45 20.45 19.73 18.70 18.70 40.5
2022-12-28 43.0 11.65 11.80 11.55 11.50 10.85 12.30 19.80 19.5 19.3 ... 19.9 19.2 18.7 20.90 20.50 20.45 19.73 18.70 18.70 40.5
2022-12-29 43.0 11.65 11.80 11.55 11.40 10.75 12.30 19.80 19.5 19.3 ... 19.9 11.7 18.7 20.90 20.60 20.45 19.73 19.55 18.70 40.5
2022-12-30 43.0 11.55 11.80 11.55 11.40 10.65 12.20 19.80 19.5 19.3 ... 19.9 11.6 18.7 21.00 20.50 20.45 19.73 19.55 18.70 40.5
2022-12-31 43.0 11.50 11.80 11.60 11.35 10.55 12.20 19.85 19.5 19.3 ... 19.9 11.5 18.7 21.10 20.60 20.45 19.73 19.35 18.70 40.5

1645 rows × 32 columns

In [ ]:
import matplotlib.pyplot as plt

month_before = 1
data = []

for col in extended_columns:
    if month_before <= 6:
        data.append((month_before, targetCorr[col]))
        month_before += 1

        if month_before == 7:
            # plot bar chart
            keys, values = zip(*data)

            plt.bar(keys, values)
            plt.xlabel('Months Before')
            plt.ylabel('Correlation to Y')
            plt.title(col[:-2])
            plt.savefig(f'./tempfigs/{col[:-2]}.png')

            plt.show()

            # reset
            data = []
            month_before = 1